Author: Madalina-Alina Racovita, 1st year master's student on Computational Optimization at Faculty of Computer Science, UAIC, Iasi, Romania

!pip install researchpy
!pip install pydotplus
!pip install xgboost
!pip install imblearn
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import numpy as np
import researchpy as rp
import pydotplus
from scipy.stats import chi2_contingency
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
os.environ['PATH'] = os.environ['PATH'] + ';' + os.environ['CONDA_PREFIX'] + r"\Library\bin\graphviz"
warnings.filterwarnings('ignore')
matplotlib.style.use('ggplot')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
The dataframes are going to be loaded in an unprocessed version since, for this task the feature selection has to be made using empirical and encapsulated manners. It is specified the fact that the features UnitsInBuilding and Stories are going to be removed from the dataframe, the classification task is going to be pursued only of the real estates geographically localized in the Washington state.
# Inspect the raw data directory, then load the unprocessed assessor files.
os.listdir('./../Data')
rcon_paths = ["./../Data/RCON_12011.assessor.tsv", "./../Data/RCON_53033.assessor.tsv"]
rsfr_paths = ["./../Data/RSFR_12011.assessor.tsv", "./../Data/RSFR_53033.assessor.tsv"]
df_rcon = pd.concat([pd.read_csv(path, sep="\t") for path in rcon_paths])
df_rsfr = pd.concat([pd.read_csv(path, sep="\t") for path in rsfr_paths])
# Binary target: 1 for condominium (RCON) records, 0 for single-family (RSFR).
df_rcon['isRCON'] = 1
df_rsfr['isRCON'] = 0
df = pd.concat([df_rcon, df_rsfr])
# Drop the two features excluded by the task statement; keep only Washington rows.
df = df.drop(columns=['UnitsInBuilding', 'Stories'])
df = df[df['State'] == 'WA']
The columns that have a single value in the entire dataframe are going to be dropped since they have no predictive value. Their names were detected during EDA.
# Constant-valued columns (detected during EDA) carry no information.
columns_constant_features = ['AtticSqft', 'IsFixer', 'GarageNoOfCars', 'EffectiveYearBuilt', 'TotalRooms', 'State']
df = df.drop(columns=columns_constant_features)
The features that have a missing percentage greater than 95% are going to be dropped, since they have no predictive value and increase the model complexity for no reason.
# Columns with >95% missing values (measured during EDA) are uninformative.
column_miss_perc_ge_95 = ['DeedType', 'RoofCode', 'BuildingShapeCode', 'City', 'StructureCode']
df = df.drop(columns=column_miss_perc_ge_95)
Missing values in the object-type columns are replaced with an empty string, and those in numerical columns with 0 — except for SellPrice, which is replaced with the mean of the real-estate category the row belongs to: residential or RCON.
# Columns that still contain NaNs after the earlier drops.
features_with_missing_values = ['BuildingCode', 'GarageCarportCode', 'PatioPorchCode', 'PoolCode', 'Zonning', \
                                'PropTaxAmount', 'FoundationCode', 'ExteriorCode', 'CoolingCode', 'HeatingCode', \
                                'HeatingSourceCode', 'View', 'DocType', 'TransType', 'DistressCode', 'SellPrice']
# Partition these columns by dtype so each group receives the right fill value.
types = df[features_with_missing_values].dtypes
object_miss_values_features = [feature for feature, dtype in zip(features_with_missing_values, types) if dtype == object]
numeric_miss_values_features = [feature for feature, dtype in zip(features_with_missing_values, types) if dtype != object]
# Object columns: NaN -> empty string.
for column in object_miss_values_features:
    df[column] = df[column].fillna('')
# Category-wise mean sale prices, computed on the original (pre-filter)
# df_rcon / df_rsfr frames, as in the original imputation strategy.
rcon_medium_sellprice = df_rcon['SellPrice'].mean()
rsfr_medium_sellprice = df_rsfr['SellPrice'].mean()
# Numeric columns other than SellPrice: NaN -> 0.
for column in numeric_miss_values_features:
    if column != 'SellPrice':
        df[column] = df[column].fillna(0)
# SellPrice imputation, vectorised (replaces a slow df.iterrows() loop that
# also carried a dead, never-read `added` flag):
#   1. a missing SellPrice falls back to LastSalePrice;
#   2. a zero SellPrice is replaced by the category mean.
# Rows where both prices are NaN keep NaN, matching the original loop
# (there `NaN != 0` evaluated True, so the NaN was appended unchanged).
sell_price = df['SellPrice'].fillna(df['LastSalePrice'])
category_mean = np.where(df['isRCON'] == 1, rcon_medium_sellprice, rsfr_medium_sellprice)
df['SellPrice'] = np.where(sell_price == 0, category_mean, sell_price)
# A zero LastSalePrice is likewise replaced by the category mean.
df.loc[(df['isRCON'] == 1) & (df['LastSalePrice'] == 0), 'LastSalePrice'] = rcon_medium_sellprice
df.loc[(df['isRCON'] == 0) & (df['LastSalePrice'] == 0), 'LastSalePrice'] = rsfr_medium_sellprice
# Convert boolean columns to 0/1 so every model input is numeric.
for column in df.select_dtypes(include=bool).columns.tolist():
    df[column] = df[column].map({False: 0, True: 1})
df.head()
The real estates with a price bigger than 2.5 million US dollars are going to be dropped since they are outliers and we want a robust model, which will learn generalities not particularities.
df = df.drop(df[df['SellPrice'] > 2500000].index)
Some features are going to be rebuilt for increasing their predictive power.
# Split SellDate ('YYYY-MM-DD...') into numeric year / month / day features.
df['SellDate_Year'] = df['SellDate'].apply(lambda date: int(date[:4]))
df['SellDate_Month'] = df['SellDate'].apply(lambda date: int(date[5:7]))
df['SellDate_Day'] = df['SellDate'].apply(lambda date: int(date[8:]))
df = df.drop(columns=['SellDate'])
# Keep only the date part of StatusDate (everything before the first space).
df['StatusDate'] = df['StatusDate'].apply(lambda value: str(value.split()[0]))
# Price normalised by the living area.
df['SellPricePerLivingSqft'] = df['SellPrice'] / df['LivingSqft']
Before constructing a Machine Learning model it is important to filter the features, since features uncorrelated with the target class needlessly complicate the model and increase the computational complexity. A model with as few features as possible — robust and with great predictive power — is desirable.
For instance, a small decrease of accuracy can be tolerated if the number of features used in the second model is a lot smaller.
The numerical features are going to be filtered based on the Pearson's correlation coefficient with the target class, isRCON.
# Pearson correlation between every pair of numeric features.
corr = df.corr()
# Generate a mask for the upper triangle so each pair is plotted once.
# BUGFIX: the np.bool alias was removed in NumPy 1.24; the builtin bool
# is the documented replacement for dtype arguments.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up a large matplotlib figure — there are dozens of numeric columns.
f, ax = plt.subplots(figsize=(30, 16))
sns.heatmap(df.corr(), annot=True, fmt=".2f", mask=mask, cmap='magma')
plt.show()
The features whose absolute correlation coefficient with the isRCON variable is greater than 0.08 are going to be kept.
# Keep the numeric features whose |Pearson r| with the target exceeds 0.08.
target_corr = corr['isRCON']
relevant_numerical_features = [name for name in target_corr.keys()
                               if abs(target_corr[name]) > 0.08 and name != 'isRCON']
relevant_numerical_features
str(len(corr['isRCON']) - len(relevant_numerical_features)) + ' numerical features were removed due to small correlation from a ' \
    + 'total of ' + str(len(corr['isRCON'])) + ' numerical features'
Let's take a glance on the features that were eliminated.
list(set(corr['isRCON'].keys()).difference(set(relevant_numerical_features)))
The categorical features are going to be filtered as well, but before that we have to collect their names.
# Collect the names of the object-dtype (categorical) columns.
# Uses select_dtypes instead of the original index-based loop over
# df.dtypes; the resulting list and its column order are identical.
character_type_features = df.select_dtypes(include=object).columns.tolist()
len(character_type_features)
The categorical features are going to be filtered by performing chi-squared tests of independence. The "so-called" correlation between isRCON and each categorical predictor is going to be measured by the Cramér's V value.
For this task the Chi-square test of independence is going to be used. Chi-square test of independence checks if there is a relationship between two nominal variables. We are considering for instance, two relevant categorical variables: Quality and isRCON, i.e. is going to be tested if the quality of the buildings is higher or lower depending on the real estate type.
The H0 (Null Hypothesis): There is no relationship between Quality and variable isRCON.
The H1 (Alternative Hypothesis): There is a relationship between Quality and isRCON.
If the p-value is significant (as close as possible to 0, preferable smaller than 0.05), you can reject the null hypothesis and claim that the findings support the alternative hypothesis.
# Raw contingency table between building quality and the target class.
contingency_table = pd.crosstab(df['Quality'], df['isRCON'])
contingency_table
# researchpy adds column proportions and runs the chi-square test in one call.
contingency_table, independence_test_results = rp.crosstab(df['Quality'], df['isRCON'], prop='col', test='chi-square')
independence_test_results
Cramér's V (sometimes referred to as Cramér's phi and denoted as φc) is a measure of association between two nominal variables.
def cramers_v(confusion_matrix):
    """Bias-corrected Cramér's V for a categorical-categorical association.

    Applies the correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328.
    """
    # Mean-square contingency (phi^2) from the chi-square statistic.
    chi2_statistic = chi2_contingency(confusion_matrix)[0]
    total = confusion_matrix.sum()
    phi_squared = chi2_statistic / total
    n_rows, n_cols = confusion_matrix.shape
    # Bias-corrected phi^2 and effective table dimensions.
    phi_squared_corrected = max(0, phi_squared - ((n_cols - 1) * (n_rows - 1)) / (total - 1))
    rows_corrected = n_rows - ((n_rows - 1) ** 2) / (total - 1)
    cols_corrected = n_cols - ((n_cols - 1) ** 2) / (total - 1)
    return np.sqrt(phi_squared_corrected / min(cols_corrected - 1, rows_corrected - 1))
def get_cramers_v_for_given_features(feature1, feature2):
    """Return Cramér's V between two categorical columns of the global df."""
    # BUGFIX: DataFrame.as_matrix() was removed in pandas 1.0;
    # to_numpy() is the documented replacement.
    confusion_matrix = pd.crosstab(df[feature1], df[feature2]).to_numpy()
    return cramers_v(confusion_matrix)
get_cramers_v_for_given_features('Quality', 'isRCON')
The features with a Cramér's V value greater than 0.2 are going to be kept.
# Keep the categorical features whose Cramér's V with the target exceeds 0.2,
# reporting each retained feature together with its association strength.
relevant_categorical_features = []
for nominal_feature in character_type_features:
    cramers_v_value = get_cramers_v_for_given_features(nominal_feature, 'isRCON')
    if cramers_v_value > 0.2:
        relevant_categorical_features.append(nominal_feature)
        # NOTE(review): cell indentation was lost in the flat export; the print
        # is assumed to report only retained features — confirm against notebook.
        print(nominal_feature + ' & isRCON Cramer\'s V = ' + str(cramers_v_value))
The features with a Cramer's V value bigger than 0.2 are going to be kept for the further predictive tasks.
# Names retained by the Cramér's V filter.
relevant_categorical_features
# Final predictor list: correlation-filtered numeric + Cramér-filtered categorical.
relevant_features = relevant_numerical_features + relevant_categorical_features
len(relevant_features)
Therefore, after the filtering based on the Pearson correlation coefficient and on the Cramér's V values, 31 features were kept as predictors for the ML model.
The dataframe is going to be encoded so that the categorical features are transformed into numerical ones, by a One-Hot-like encoding process. The numerical variables are going to be kept unmodified.
# One-hot encode the categorical predictors; numeric columns pass through.
encoded_df = pd.get_dummies(df[relevant_features])
# Re-attach the target, which was not part of the encoded feature set.
encoded_df['isRCON'] = df['isRCON']
len(list(encoded_df))
encoded_df.head(3)
Since the number of features in the encoded dataframe is quite big, and this will increase the complexity of the computations for the ML models, I am going to filter these features as well, by removing the ones with an absolute Pearson correlation coefficient smaller than 0.08, analogously to the previous filtering step on the initial dataframe.
# Repeat the correlation filter on the one-hot encoded dataframe.
corr = encoded_df.corr()
encoded_target_corr = corr['isRCON']
relevant_features_from_encoded_df = [name for name in encoded_target_corr.keys()
                                     if abs(encoded_target_corr[name]) > 0.08 and name != 'isRCON']
len(relevant_features_from_encoded_df)
From a total of 479 variables, 75 are going to be kept for model building.
# Predictor matrix and target vector used by all models below.
X = encoded_df[relevant_features_from_encoded_df]
y = df['isRCON']
Imbalanced datasets are those where there is a severe skew in the class distribution. This problem can be solved with resampling strategies. Resampling involves creating a new transformed version of the training dataset in which the selected examples have a different class distribution.
df['isRCON'].value_counts()
Since the number of residential properties is bigger than the number of RCON real estates, it is possible to have some problems caused by the unbalanced dataset. I am going to build a prototype classification model to see how the class imbalance affects the performance. As performance metrics, the confusion matrix and the precision, recall and accuracy values are going to be analyzed. Precision and recall are defined as follows:

${\displaystyle {\textbf{Precision}}={\frac {tp}{tp+fp}}\,}$
${\displaystyle {\textbf{Recall}}={\frac {tp}{tp+fn}}\,}$
def get_performance_results_for_model(model, y_test, y_pred):
    """Print accuracy, plot the confusion matrix and print the report.

    Parameters
    ----------
    model : estimator
        Kept for interface compatibility; not used inside the function.
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    """
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred), '\n\n')
    print('_________________ Confusion Matrix __________________')
    conf_matrix = confusion_matrix(y_test, y_pred)
    # BUGFIX: confusion_matrix orders rows/columns by sorted label value,
    # while list(set(...)) has no guaranteed order; np.unique returns the
    # same sorted labels, so the ticks always match the matrix cells now.
    labels = np.unique(y_test)
    fig = plt.figure(figsize=(7, 2))
    # The matrix is transposed, hence predicted labels end up on the y axis.
    sns.heatmap(conf_matrix.T, annot=True, fmt='d',
                cmap='BuPu', cbar=False,
                xticklabels=labels,
                yticklabels=labels)
    plt.xlabel('true label')
    plt.ylabel('predicted label')
    plt.show()
    print('_______________ Classification Report _______________\n\n',
          classification_report(y_test, y_pred))
def build_1nn_model(x_df, target):
    """Train a 1-NN classifier, print its evaluation metrics and return it.

    Returns the classifier fitted on the 70% training split.
    """
    # train-test split: test dataframe will have 30% from the entire dataframe
    X_train, X_test, y_train, y_test = train_test_split(x_df, target, test_size=0.3, random_state=1)
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(X_train, y_train)
    # NOTE(review): the reported metrics come from 5-fold CV predictions
    # computed on the *test* split (cross_val_predict refits clones of `knn`
    # internally); the model fitted above is returned but its own test
    # predictions are never evaluated — confirm this protocol is intended.
    pred = cross_val_predict(knn, X_test, y_test, cv=5)
    print('_____________________ 1-NN _______________________\n')
    get_performance_results_for_model(knn, y_test, pred)
    return knn
knn_model_imbalanced_df = build_1nn_model(X, y)
As it can be seen, the model overlearns the majority class, and the minority class is not predicted as well, its precision and recall metrics being a lot smaller.
Random oversampling duplicates examples from the minority class in the training dataset. Downside: overfitting for some models. Random undersampling deletes examples from the majority class. Downside: losing information invaluable to a model.
Since the number of residential estates is bigger and represents 5/6 of the entire dataset, undersampling won't be chosen as the strategy for the imbalanced class problem.
# imblearn's random over-sampler duplicates minority-class rows until balanced.
from imblearn.over_sampling import RandomOverSampler
print('Before oversampling: ', Counter(y))
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X, y)
print('After oversampling:', Counter(y_over))
# Re-evaluate the 1-NN baseline on the balanced data.
knn_model_after_oversampling = build_1nn_model(X_over, y_over)
# From here on, every model works with the balanced dataset.
X = X_over
y = y_over

Stratification is the process of rearranging the data so as to ensure that each fold is a good representative of the whole. For example, in a binary classification problem where each class comprises of 50% of the data, it is best to arrange the data such that in every fold, each class comprises of about half the instances.
It is generally a better approach when dealing with both bias and variance. A randomly selected fold might not adequately represent the minor class, particularly in cases where there is a huge class imbalance.

# Demonstrate that stratified folds preserve the class distribution.
skf = StratifiedKFold(n_splits=5, random_state=None)
for fold_index, (train_index, val_index) in enumerate(skf.split(X, y), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[val_index]
    y_train, y_test = y.iloc[train_index], y.iloc[val_index]
    print('Train ' + str(fold_index) + 'th fold y distribution: ', Counter(y_train))
    print('Test ' + str(fold_index) + 'th fold y distribution: ', Counter(y_test))
    print('___________________________________________________________')
As it can be noticed each fold has the same distribution for the target class due to the stratification. The StratifiedKFold can be performed as well using the function cross_val_predict from sklearn.model_selection with the parameter cv = integer, to specify the number of folds in a (Stratified)KFold.
# Final 70/30 hold-out split used by the models below; the fixed seed keeps
# the comparison between classifiers consistent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)
from sklearn.tree import DecisionTreeClassifier
# Unpruned decision-tree baseline; metrics from 5-fold CV on the test split.
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)
y_pred = cross_val_predict(clf, X_test, y_test, cv=5)
get_performance_results_for_model(clf, y_test, y_pred)
# Render the fitted tree with graphviz via pydotplus and save it as a PNG.
dot_data = StringIO()
export_graphviz(clf, out_file = dot_data,
                filled=True, rounded = True,
                special_characters = True, feature_names = relevant_features_from_encoded_df,
                class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('real_estates_dt.png')
Image(graph.create_png())
It is known that decision-tree learners can create over-complex trees that do not generalise the data well (i.e. produce overfitting). Since the accuracy was 99.8% using unpruned decision trees, I am going to build a model with max-depth 4 to see which are the most important features. As the split criterion I am going to use entropy, instead of the Gini index.
Gini impurity measures the probability of a particular variable being wrongly classified when it is randomly chosen.
Information Gain is used to determine which feature/attribute gives us the maximum information about a class. It is based on the concept of entropy, which is the degree of disorder. It aims to reduce the level of entropy starting from the root node to the leave nodes.
$ \textbf{Gini}(var) = 1 - \sum\limits_{i=1}^n {p_i}^2$, where $p_i$ is the probability of an object being classified to a particular class. The attribute/feature with the least Gini index is chosen as the root node.
$ \textbf{Entropy}(var) = \sum\limits_{i=1}^n -p_i \log_2{p_i}$
# Shallow (depth-4) entropy tree: nearly as accurate, far more interpretable.
clf = DecisionTreeClassifier(max_depth=4, criterion="entropy")
clf = clf.fit(X_train, y_train)
y_pred = cross_val_predict(clf, X_test, y_test, cv=5)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
As it can be seen, the accuracy is still a very good one, 98.5%. Let's take a look at the structure of the decision tree classifier.
# Export the pruned tree to a PNG image.
dot_data = StringIO()
export_graphviz(clf,
                out_file=dot_data,
                filled=True,
                rounded=True,
                special_characters=True,
                feature_names=relevant_features_from_encoded_df,
                class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('real_estates_dt_maxdepth3.png')
Image(graph.create_png())
It can be seen that the missing value for HeatingCode has the biggest predictive power. It is followed by features like LandSqft, BuildingCode, YearBuilt, Zonning_SF, HeatingSourceCode, LandValue and View.
A decision tree classifier is going to be constructed for each feature. The features corresponding to the models with the best performance results are going to be selected.
def get_accuracy_for_dt_classifier_based_on_df(X_given, y_given):
    """Accuracy of a decision tree trained on one single feature column."""
    X_train, X_test, y_train, y_test = train_test_split(X_given, y_given, test_size=0.4, random_state=1)
    single_feature_clf = DecisionTreeClassifier().fit(X_train.values.reshape(-1, 1), y_train)
    predictions = single_feature_clf.predict(X_test.values.reshape(-1, 1))
    return metrics.accuracy_score(y_test, predictions)
# Single-feature accuracy of every predictor, then indices sorted best-first.
accuracies = [get_accuracy_for_dt_classifier_based_on_df(X[feature], y) for feature in list(X)]
indexes_best_accuracies = sorted(range(len(accuracies)), key=lambda i: accuracies[i], reverse=True)
There are going to be kept the predictors that have a bit bigger predictive value than random guessing.
# Keep predictors whose single-feature accuracy beats random guessing (> 0.52).
features_best_predictive_value_dtc = [list(X)[index_feature]
                                      for index_feature in indexes_best_accuracies
                                      if accuracies[index_feature] > 0.52]
len(features_best_predictive_value_dtc)
print('Predictors with biggest predictive value: \n\n', features_best_predictive_value_dtc[:10])
print('Predictors that are going to be eliminated after decission tree selection: \n\n',
      list(set(list(X)).difference(set(features_best_predictive_value_dtc))))
# Restrict X to the selected predictors and re-split (60/40, seeded).
X = X[features_best_predictive_value_dtc]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

# Sweep k = 1..29 and record the cross-validated error rate on the test split.
error_rate = []
for neighbour_count in range(1, 30):
    candidate_knn = KNeighborsClassifier(n_neighbors=neighbour_count)
    candidate_knn.fit(X_train, y_train)
    fold_predictions = cross_val_predict(candidate_knn, X_test, y_test, cv=5)
    error_rate.append(np.mean(fold_predictions != y_test))
plt.figure(figsize=(15, 6))
plt.plot(range(1, 30), error_rate, color='green',
         linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=4)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
# Best/worst k values (argmin/argmax are 0-based, k starts at 1).
best_k = np.argmin(error_rate) + 1
best_k
worst_k = np.argmax(error_rate) + 1
worst_k
# Compare k-NN performance at the best and then at the worst k found above.
for chosen_k in (best_k, worst_k):
    knn = KNeighborsClassifier(n_neighbors=chosen_k)
    knn.fit(X_train, y_train)
    y_pred = cross_val_predict(knn, X_test, y_test, cv=5)
    get_performance_results_for_model(knn, y_test, y_pred)
The differences are notable: the accuracy for 1-NN is higher, i.e. 93.7% compared to 87.6%, as are the confusion-matrix results, along with precision and recall for both classes.

Bayes formula with the presumption of mutual independence between features: $P(Y | X_1, X_2, ...., X_n) = \frac{P(Y) \prod\limits_{i=1}^n P(X_i | Y)}{P(X_1, X_2, ...., X_n)} $
Bayes Naive tries to find: $\widehat{Y} = argmax_y P(Y) \prod\limits_{i=1}^n P(X_i | Y)$
The Gaussian Naive Bayes was chosen due to the continuous nature of some features. This type of NB assumes that the likelihood of the features is Gaussian: $P(X_i | Y) = \frac{1}{\sqrt{2\pi {\sigma(Y)}^2}} exp(- \frac{(X_i-\mu(Y))^2}{2\sigma(Y)}^2) $
from sklearn.naive_bayes import GaussianNB
# Gaussian NB, appropriate given the continuous predictors.
gaussian_nb = GaussianNB().fit(X_train, y_train)
y_pred = cross_val_predict(gaussian_nb, X_test, y_test, cv=5)
print('______________ Gaussian Naive Bayes _______________\n')
get_performance_results_for_model(gaussian_nb, y_test, y_pred)

from sklearn.neural_network import MLPClassifier
# First MLP attempt: lbfgs solver, (100, 2) hidden layout, *unscaled* inputs.
clf = MLPClassifier(solver='lbfgs', alpha=1e-4,
                    hidden_layer_sizes=(100, 2),
                    random_state=1)
clf.fit(X_train, y_train)
# No explicit cv argument here: cross_val_predict uses its default folding.
y_pred = cross_val_predict(clf, X_test, y_test)
get_performance_results_for_model(clf, y_test, y_pred)
As it can be seen, the neural network performs badly, since it classifies every instance as being RCON.
I am going to modify the configuration of the Neural Network, scaling the training data in a previous step.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
scaled_data_train = sc.fit_transform(X_train)
# BUGFIX: the test set must be scaled with the statistics learned on the
# training set; the original fit_transform call refit the scaler on the
# test data (information leakage / inconsistent feature spaces).
scaled_data_test = sc.transform(X_test)
clf = MLPClassifier(solver='adam', activation='tanh',
                    alpha=1e-4,  # regularization parameter for L2
                    hidden_layer_sizes=(100, 2),
                    early_stopping=True,
                    random_state=1)
clf.fit(scaled_data_train, y_train)
y_pred = cross_val_predict(clf, scaled_data_test, y_test)
get_performance_results_for_model(clf, y_test, y_pred)
The improvements are more than clear. I used 'tanh' due to the problems that the logistic function has (vanishing gradients: the gradient is almost 0 on $(-\infty, -4)$ and $(4, +\infty)$ due to the asymptotic nature of the sigmoid function). I didn't use ReLU due to the exploding-gradient problem that can arise. I used Adam (Adaptive Moment estimation) as the optimiser since it builds on the AdaGrad (Adaptive Gradient) and RMSProp (Root Mean Square Propagation) optimisers.
I am going to try the same configuration on a neural network with more than 1 hidden layer to see how it performs. I would decrease the number of neurons from a layer to another to form that bottle neck effect.
len(list(X))
The first hidden layer will be formed of 150 neurons.
# Deeper, funnel-shaped MLP (150 -> 50 -> 10 -> 2), same training recipe.
clf = MLPClassifier(solver='adam',
                    activation='tanh',
                    alpha=1e-4,  # regularization parameter for L2
                    hidden_layer_sizes=(150, 50, 10, 2),
                    early_stopping=True,
                    random_state=1)
clf.fit(scaled_data_train, y_train)
y_pred = cross_val_predict(clf, scaled_data_test, y_test)
get_performance_results_for_model(clf, y_test, y_pred)
The differences between accuracies are not that notable, more exactly 3e-05. The simpler architecture performs better.

The implementation for SVM from sklearn is based on libsvm. The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples. For large datasets it is suggested to use sklearn.svm.LinearSVC or sklearn.linear_model.SGDClassifier instead, possibly after a sklearn.kernel_approximation.Nystroem transformer.
LinearSVC is similar to SVC(Support Vector Classifier) with parameter kernel=’linear’, but implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.
The Nystroem method is a general method for low-rank approximations of kernels. It achieves this by essentially subsampling the data on which the kernel is evaluated. By default Nystroem uses the rbf kernel, but it can use any kernel function or a precomputed kernel matrix.
from sklearn.kernel_approximation import Nystroem
from sklearn.svm import LinearSVC
kernel_types = ['linear', 'poly', 'rbf', 'sigmoid']
def build_linersvc_with_nystroem_approx(kernel_type, verbose=False):
    """Train a LinearSVC on a Nystroem approximation of `kernel_type`.

    Prints the CV accuracy on the test split; with verbose=True the full
    performance report is printed as well. Returns the fitted classifier.
    """
    clf = LinearSVC()
    feature_map_nystroem = Nystroem(kernel=kernel_type,
                                    gamma=.4,
                                    random_state=1,
                                    n_components=200)
    data_train_transformed = feature_map_nystroem.fit_transform(X_train)
    clf.fit(data_train_transformed, y_train)
    # BUGFIX: transform (not fit_transform) the test data, so the landmark
    # points learned from the training set are reused; refitting on the test
    # set leaked information and produced an inconsistent feature space.
    data_test_transformed = feature_map_nystroem.transform(X_test)
    y_pred = cross_val_predict(clf, data_test_transformed, y_test)
    print('Accuracy for Linear SVC with ' + kernel_type + ' kernel: ', metrics.accuracy_score(y_test, y_pred))
    if verbose:
        get_performance_results_for_model(clf, y_test, y_pred)
    return clf
for kernel_type in kernel_types:
    build_linersvc_with_nystroem_approx(kernel_type, verbose=False)
Since the best kernel was the linear one, I am going to build a model but only using LinearSVC, without kernel approximation.
The default loss function for LinearSVC is squared_hinge, the penalty term is L2 (penalty with the Euclidean norm), and dual=True, which implies that the algorithm will solve the dual optimisation problem. Let's see how the algorithm performs with the default configuration.
# LinearSVC with the default configuration (squared_hinge loss, L2, dual).
clf = LinearSVC(random_state=0)
clf.fit(X_train, y_train)
y_pred = cross_val_predict(clf, X_test, y_test)
# BUGFIX: the helper's signature is (model, y_test, y_pred); the two label
# vectors were passed swapped, which transposed the confusion matrix and
# mislabelled precision/recall in the printed report.
get_performance_results_for_model(clf, y_test, y_pred)
I am going to change the penalty term to L1, due to the sparsity among feature values, and I will set dual to False since the number of samples is greater than the number of features. I will set the tolerance to a smaller value; the default is 1e-4, and I will set it to 1e-7.
# L1 penalty encourages sparse coefficients; dual=False suits n_samples > n_features.
clf = LinearSVC(random_state=0, penalty='l1', dual=False, tol=1e-7)
clf.fit(X_train, y_train)
y_pred = cross_val_predict(clf, X_test, y_test)
# BUGFIX: arguments were swapped (y_pred passed as y_test and vice versa).
get_performance_results_for_model(clf, y_test, y_pred)
The improvements are more than obvious, so that with the new configuration the global accuracy reached the value of 97.5%.
from sklearn.feature_selection import SelectFromModel #to select the features with non-zero coeficients
# Refit the sparse L1 model and keep only the features it deems useful.
clf = LinearSVC(random_state=0, penalty='l1', dual=False, tol=1e-7)
clf.fit(X_train, y_train)
model = SelectFromModel(clf, prefit=True)
X_feature_selected = model.transform(X)
feature_idx = model.get_support()
feature_names = X.columns[feature_idx]
len(feature_names)
# Retrain and evaluate on the reduced feature set.
clf = LinearSVC(random_state=0, penalty='l1', dual=False, tol=1e-7)
clf.fit(X_train[feature_names], y_train)
y_pred = cross_val_predict(clf, X_test[feature_names], y_test)
# BUGFIX: y_test / y_pred order corrected to match the helper's signature.
get_performance_results_for_model(clf, y_test, y_pred)


from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)
# FIX: removed a dead `y_pred = rf.predict(X_test)` whose result was
# immediately overwritten by the cross-validated predictions below.
y_pred = cross_val_predict(rf, X_test, y_test, cv=5)
get_performance_results_for_model(rf, y_test, y_pred)
# Impurity-based feature importances, largest first.
feature_imp = pd.Series(rf.feature_importances_, index=list(X)).sort_values(ascending=False)
feature_imp[:10]
fig = plt.figure(figsize=(13, 13))
sns.barplot(x=feature_imp, y=feature_imp.index)
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.show()
The most important features are going to be selected in order to build a simplified classification model.
# Keep the features contributing more than 1% of the total importance.
most_important_features = [name for name in feature_imp.keys() if feature_imp[name] > 0.01]
len(most_important_features)
# Retrain the forest on the reduced feature set and evaluate via 5-fold CV.
rf_most_important = RandomForestClassifier(n_estimators=10, random_state=42)
rf_most_important.fit(X_train[most_important_features], y_train)
y_pred = cross_val_predict(rf_most_important, X_test[most_important_features], y_test, cv=5)
errors = abs(y_pred - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 6))
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
The difference between the accuracies is not that large, i.e. 1e-03, the second model having an accuracy of 99.6% and the first one 99.7%. Therefore I am going to use the simplified model for the tuning stage, in which we will search for the proper number of estimators.
# Visualise the first tree of the simplified forest as a PNG image.
tree = rf_most_important.estimators_[0]
dot_data = StringIO()
export_graphviz(tree,
                out_file=dot_data,
                filled=True,
                rounded=True,
                special_characters=True,
                feature_names=most_important_features,
                class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('random_forest_first_estimator.png')
Image(graph.create_png())